machine="mythinkpad" # define the machine we work on
loadALL = FALSE # only load CpG shared by half fish per trt group
loadannot = TRUE # load genome annotations
sourceDMS = TRUE # Load the calculated DMS
source("R02.3_DATALOAD.R")
## Warning: replacing previous import 'Biostrings::pattern' by 'grid::pattern' when
## loading 'genomation'
library(knitr)
knitr::opts_chunk$set(cache = TRUE, warning = FALSE, 
                      message = FALSE, cache.lazy = FALSE, cache.path = "../../gitignore/RmdCache/", # keep heavy data in gitignore cache
                      fig.path = "Rfigures/Rmd/")

1 Bottom up analysis: Differential Methylation in offspring

1.1 Differential methylation in all G2 (brother pair and sex as covariate)

1.1.1 TREATMENT EFFECT: Differential methylation between control and infected, within each parental treatment

## Features Annotation (use package genomation v1.24.0)
## NB Promoters are defined by options at genomation::readTranscriptFeatures function. 
## The default option is to take -1000,+1000bp around the TSS and you can change that. 
## -> following Heckwolf 2020 and Sagonas 2020, we consider 1500bp upstream and 500 bp downstream

## Parents comparison:
diffAnn_PAR = annotateWithGeneParts(as(DMSlist$DMS_15pc_G1_C_T,"GRanges"),annotBed12)
## Offspring from control parents comparison: 
diffAnn_G2_controlG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_CC_CT,"GRanges"),annotBed12)
## Offspring from infected parents comparison:
diffAnn_G2_infectedG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_TC_TT,"GRanges"),annotBed12)
par(mfrow=c(1,3))
par(mar = c(.1,0.1,5,0.1)) # Set the margin on all sides to 2
## Parents comparison:
diffAnn_PAR = annotateWithGeneParts(as(DMSlist$DMS_15pc_G1_C_T,"GRanges"),annotBed12)
diffAnn_PAR
##   promoter       exon     intron intergenic 
##       8.58      15.84      34.13      45.72 
##   promoter       exon     intron intergenic 
##       8.58      13.19      32.51      45.72 
## promoter     exon   intron 
##     1.07     0.19     0.44 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3    3020    8128   15141   19226  300247
genomation::plotTargetAnnotation(diffAnn_PAR,precedence=TRUE, main="DMS G1", cex.legend = 1, border="white")

## Offspring from control parents comparison:
diffAnn_G2_controlG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_CC_CT,"GRanges"),annotBed12)
diffAnn_G2_controlG1
##   promoter       exon     intron intergenic 
##      11.28      21.05      30.16      44.44 
##   promoter       exon     intron intergenic 
##      11.28      16.21      28.07      44.44 
## promoter     exon   intron 
##     0.38     0.07     0.14 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      11    2602    6295   14212   15906  261017
genomation::plotTargetAnnotation(diffAnn_G2_controlG1,precedence=TRUE, main="DMS G2-G1c", cex.legend = 1, border="white")
## Offspring from infected parents comparison:
diffAnn_G2_infectedG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_TC_TT,"GRanges"),annotBed12)
diffAnn_G2_infectedG1
##   promoter       exon     intron intergenic 
##      12.90      13.62      34.64      46.81 
##   promoter       exon     intron intergenic 
##      12.90       9.42      30.87      46.81 
## promoter     exon   intron 
##     0.28     0.03     0.08 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       9    2355    7150   12285   14773  109527
genomation::plotTargetAnnotation(diffAnn_G2_infectedG1,precedence=TRUE, main="DMS G2-G1i", cex.legend = 1, border="white")

par(mfrow=c(1,1))
## Run the function to get DMS info
DMS_info_G1 <- myDMSinfo(DMSlist$DMS_15pc_G1_C_T, uniteCov6_G1_woSexAndUnknowChrOVERLAP)
DMS_info_G2_G1c_final <- myDMSinfo(DMSlist$DMS_15pc_CC_CT, uniteCov14_G2_woSexAndUnknowChrOVERLAP)
DMS_info_G2_G1i_final <- myDMSinfo(DMSlist$DMS_15pc_TC_TT,uniteCov14_G2_woSexAndUnknowChrOVERLAP)

NB Kostas’ results: “We found a total of 1,973 CpG sites out of 1,172,887 CpGs (0.17%) across the genome that showed at least 15% differential fractional methylation (differentially methylated site [DMS]; q < 0.01) between infected and uninfected fish”

Here: we obtain out a total of 1001880 CpG sites: 3648 (0.36%) showed at least 15% differential fractional methylation (differentially methylated site [DMS]; q < 0.01) between infected and uninfected fish in the parental group; 1197 (0.12%) in the offspring from control parents comparison; 690 (0.07%) in the offspring from infected parents comparison.

1.1.1.1 Intersections of DMS: Venn diagrams

## Chi2 test: are the number of DMS from G2-G1C and G2-G1I overlapping with DMSpar statistically different?

A=length(intersect(DMS_info_G1$DMS,DMS_info_G2_G1c_final$DMS))
B=length(DMS_info_G2_G1c_final$DMS)
C=length(intersect(DMS_info_G1$DMS,DMS_info_G2_G1i_final$DMS))
D=length(DMS_info_G2_G1i_final$DMS)

Observed=matrix(c(A, B-A, C, D-C),nrow=2)
Observed
##      [,1] [,2]
## [1,]  106   59
## [2,] 1091  631
chisq.test(Observed)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Observed
## X-squared = 0.019909, df = 1, p-value = 0.8878
## not statistically different
## output Venn diagrams
allVenn <- ggVennDiagram(list("DMS G1" = DMS_info_G1$DMS, "DMS G2-c" = DMS_info_G2_G1c_final$DMS, "DMS G2-i" = DMS_info_G2_G1i_final$DMS), label_alpha = 0) +
  scale_fill_gradient(low="white",high = "red")
hypoVenn <- ggVennDiagram(list("DMS G1\nhypo" = DMS_info_G1$DMS[DMS_info_G1$direction %in% "hypo"],
                               "DMS G2-c\nhypo" = DMS_info_G2_G1c_final$DMS[DMS_info_G2_G1c_final$direction %in% "hypo"],
                               "DMS G2-i\nhypo" = DMS_info_G2_G1i_final$DMS[DMS_info_G2_G1i_final$direction %in% "hypo"]), label_alpha = 0) +
  scale_fill_gradient(low="white",high = "red")
hyperVenn <- ggVennDiagram(list("DMS G1\nhyper" = DMS_info_G1$DMS[DMS_info_G1$direction %in% "hyper"],
                                "DMS G2-c\nhyper" = DMS_info_G2_G1c_final$DMS[DMS_info_G2_G1c_final$direction %in% "hyper"],
                                "DMS G2-i\nhyper" = DMS_info_G2_G1i_final$DMS[DMS_info_G2_G1i_final$direction %in% "hyper"]), label_alpha = 0) +
  scale_fill_gradient(low="white",high = "red")

ggarrange(allVenn,
          ggarrange(hypoVenn, hyperVenn, ncol = 2, legend = "none"),
          nrow = 2, widths = c(.5,1))

1.1.1.1.1 Venn with annotated features
runHyperHypoAnnot <- function(){
  par(mfrow=c(2,3))
  par(mar = c(.1,0.1,5,0.1)) # Set the margin on all sides to 2
  ####### HYPO
  ## Parents comparison:
  A = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hypo",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(A,precedence=TRUE, main="DMS G1\nhypo", 
                                   cex.legend = .4, border="white")
  ## Offspring from control parents comparison:
  B = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hypo",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(B,precedence=TRUE, main="DMS G2-G1c\nhypo", 
                                   cex.legend = .4, border="white")
  ## Offspring from infected parents comparison:
  C = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hypo",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(C,precedence=TRUE, main="DMS G2-G1i\nhypo", 
                                   cex.legend = .4, border="white")
  ####### HYPER
  ## Parents comparison:
  D = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hyper",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(D,precedence=TRUE, main="DMS G1\nhyper", 
                                   cex.legend = .4, border="white")
  ## Offspring from control parents comparison:
  E = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hyper",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(E,precedence=TRUE, main="DMS G2-G1c\nhyper", 
                                   cex.legend = .4, border="white")
  ## Offspring from infected parents comparison:
  f = annotateWithGeneParts(
    as(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hyper",],"GRanges"),annotBed12)
  genomation::plotTargetAnnotation(f,precedence=TRUE, main="DMS G2-G1i\nhyper", 
                                   cex.legend = .4, border="white")
  par(mfrow=c(1,1))
  return(list(G1hypo=A, G2G1chypo=B, G2G1ihypo=C, G1hyper=D, G2G1chyper=E, G2G1ihyper=f))
}

myannot=runHyperHypoAnnot()

############################################################
## Venn diagram of overlapping features by their annotation:
table(rowSums(as.data.frame(myannot$G1hypo@members))) # NB: some positions are labelled with several features!
## 
##   0   1   2   3 
## 559 566  49   2
## as in MBE 2021: "giving precedence to the following order promoters, exons,
## introns, and intergenic regions when features overlapped"

myAnnotateDMS <- function(DMS, annot){
  ## sanity check
  if (nrow(DMS) != nrow(annot)){"STOP error in arguments"}
  DMS$pos <- paste(DMS$chr, DMS$start, DMS$end)
  ## NB as in MBE 2021: "giving precedence to the following order promoters, exons,
  ## introns, and intergenic regions when features overlapped"
  DMS$feature <- NA
  ## 1. promoters
  DMS$feature[which(annot$prom == 1)] = "promoter"
  ## 2. exons
  DMS$feature[which(annot$exon == 1 & annot$prom ==0)] = "exon"
  ## 3. intron
  DMS$feature[which(annot$intro == 1 & annot$exon == 0 & annot$prom ==0)] = "intron"
  ## 4. intergenic regions
  DMS$feature[which(annot$intro == 0 & annot$exon == 0 & annot$prom ==0)] = "intergenic"
  return(DMS)
}

DMSlist$DMS_15pc_G1_C_T = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T, as.data.frame(diffAnn_PAR@members))
DMSlist$DMS_15pc_G1_C_T_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hypo",],
                                             as.data.frame(myannot$G1hypo@members))
DMSlist$DMS_15pc_G1_C_T_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hyper",],
                                              as.data.frame(myannot$G1hyper@members))

DMSlist$DMS_15pc_CC_CT = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT, as.data.frame(diffAnn_G2_controlG1@members))
DMSlist$DMS_15pc_CC_CT_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hypo",],
                                            as.data.frame(myannot$G2G1chypo@members))
DMSlist$DMS_15pc_CC_CT_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hyper",],
                                             as.data.frame(myannot$G2G1chyper@members))

DMSlist$DMS_15pc_TC_TT = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT, as.data.frame(diffAnn_G2_infectedG1@members))
DMSlist$DMS_15pc_TC_TT_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hypo",],
                                            as.data.frame(myannot$G2G1ihypo@members))
DMSlist$DMS_15pc_TC_TT_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hyper",],
                                             as.data.frame(myannot$G2G1ihyper@members))

## Make Venn diagram for each feature
getFeatureDFHYPO <- function(myfeat){
  a = DMSlist$DMS_15pc_G1_C_T_HYPO$pos[DMSlist$DMS_15pc_G1_C_T_HYPO$feature %in% myfeat]
  b = DMSlist$DMS_15pc_CC_CT_HYPO$pos[DMSlist$DMS_15pc_CC_CT_HYPO$feature %in% myfeat]
  c = DMSlist$DMS_15pc_TC_TT_HYPO$pos[DMSlist$DMS_15pc_TC_TT_HYPO$feature %in% myfeat]
  return(list(a=a,b=b,c=c))
}

getFeatureDFHYPER <- function(myfeat){
  a = DMSlist$DMS_15pc_G1_C_T_HYPER$pos[DMSlist$DMS_15pc_G1_C_T_HYPER$feature %in% myfeat]
  b = DMSlist$DMS_15pc_CC_CT_HYPER$pos[DMSlist$DMS_15pc_CC_CT_HYPER$feature %in% myfeat]
  c = DMSlist$DMS_15pc_TC_TT_HYPER$pos[DMSlist$DMS_15pc_TC_TT_HYPER$feature %in% myfeat]
  return(list(a=a,b=b,c=c))
}

getVenn <- function(feat, direction){
  if (direction == "hypo"){
    ggVennDiagram(list(A = getFeatureDFHYPO(feat)[["a"]],
                       B = getFeatureDFHYPO(feat)[["b"]],
                       C = getFeatureDFHYPO(feat)[["c"]]), label_alpha = 0,
                  category.names = c(paste0("DMS G1\nhypo\n", feat), paste0("DMS G2-c\nhypo\n", feat), paste0("DMS G2-i\nhypo\n", feat))) +
      scale_fill_gradient(low="white",high = "red")
  } else if (direction == "hyper"){
    ggVennDiagram(list(A = getFeatureDFHYPER(feat)[["a"]],
                       B = getFeatureDFHYPER(feat)[["b"]],
                       C = getFeatureDFHYPER(feat)[["c"]]), label_alpha = 0,
                  category.names = c(paste0("DMS G1\nhyper\n", feat), paste0("DMS G2-c\nhyper\n", feat), paste0("DMS G2-i\nhyper\n", feat))) +
      scale_fill_gradient(low="white",high = "red")
  }
}
ggarrange(getVenn("promoter", "hypo"), getVenn("exon", "hypo"),
          getVenn("intron", "hypo"), getVenn("intergenic", "hypo"),
          nrow = 2, ncol = 2)

ggarrange(getVenn("promoter", "hyper"), getVenn("exon", "hyper"),
          getVenn("intron", "hyper"), getVenn("intergenic", "hyper"),
          nrow = 2, ncol = 2)

1.1.1.2 Manhattan plot of DMS

## Parents trt-ctrl
# load annotation
annot_PAR <- as.data.frame(diffAnn_PAR@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_G1_C_T, annotFile = annot_PAR, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G1 DMS")

## G2-G1c trt-ctrl
# load annotation
annot_G2_G1c <- as.data.frame(diffAnn_G2_controlG1@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_CC_CT, annotFile = annot_G2_G1c, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1c DMS")

## G2-G1i trt-ctrl
# load annotation
annot_G2_G1i <- as.data.frame(diffAnn_G2_infectedG1@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_TC_TT, annotFile = annot_G2_G1i, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1i DMS")

## Outliers in Manhattan plot: 15% diff + 2SD
outliers_G1_final <- which(abs(DMSlist$DMS_15pc_G1_C_T$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_G1_C_T$meth.diff)))
outliers_annot_G1 <- as.data.frame(diffAnn_PAR@members)[outliers_G1_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_G1_C_T[outliers_G1_final, ],
                   annotFile = outliers_annot_G1, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G1 DMS")

outliers_G2_G1c_final <- which(abs(DMSlist$DMS_15pc_CC_CT$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_CC_CT$meth.diff)))
outliers_annot_G2_G1c <- as.data.frame(diffAnn_G2_controlG1@members)[outliers_G2_G1c_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_CC_CT[outliers_G2_G1c_final, ],
                   annotFile = outliers_annot_G2_G1c, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1c DMS")

outliers_G2_G1i_final <- which(abs(DMSlist$DMS_15pc_TC_TT$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_TC_TT$meth.diff)))
outliers_annot_G2_G1i <- as.data.frame(diffAnn_G2_infectedG1@members)[outliers_G2_G1i_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_TC_TT[outliers_G2_G1i_final, ],
                   annotFile = outliers_annot_G2_G1i, GYgynogff = GYgynogff, 
                   mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1i DMS")

1.1.2 PARENTAL EFFECT: Differential methylation between control G2 (G1 infected or not) and between infected G2 (G1 infected of not)

1.2 Differential methylation by brother pair (sex as covariate)

  1. CC-TC = CONTROL fish (parent CvsT)
  2. CT-TT = TREATMENT fish (parent CvsT)
  3. CC-CT = fish from CONTROL parents (G2 CvsT)
  4. TC-TT = fish from TREATMENT parents (G2 CvsT)

TBC To be continued

## All in DMSBPlist
# 
# ## Add fam to bp
# names(DMSBPlist) <- paste0(plyr::join(data.frame(BP = names(DMSBPlist)),
#                                       unique(data.frame(BP = fullMetadata_OFFS$brotherPairID, 
#                                                         fam = fullMetadata_OFFS$Family)))[[2]], "_", names(DMSBPlist))

## Extract DMS (by position)
myPosList = lapply(DMSBPlist, lapply, function(x){paste(x$chr, x$end)})

## Find DMS present in at least 4 BP out of 8 (half):
get2keep = function(Compa){
  x <- lapply(myPosList, function(x){unlist(x[[paste0("DMS_15pc_BP_", Compa)]])})
  f <- table(unlist((x))) # each DMS present between 1 and 8 times
  tokeep <- names(f)[f >= 4]
  print(length(tokeep))
  ## Keep the DMS present in 4 families minimum
  DMSBPlist_INTER4 <- lapply(x, function(x){x[x %in% tokeep]})
  ## Reorder by family:
  DMSBPlist_INTER4 <- DMSBPlist_INTER4[names(DMSBPlist_INTER4)[order(names(DMSBPlist_INTER4))]]
  return(DMSBPlist_INTER4)
}

1.2.1 Attribute plot of these DMS in every BP

## Prepare df for complexUpset
getUpsetDF = function(i){ # for a given comparison
  A = get2keep(vecCompa[i]) 
  A2 = lapply(A, function(x){
    x = data.frame(x)    # vector of DMS as df
    names(x) = "DMS"    # name each CpG
    return(x)
  })
  ## Add BP name
  for (i in 1:length(names(A2))){
    A2[[i]]["BP"] = names(A2)[i]
  }
  # make a dataframe
  A2 = A2 %>% reduce(full_join, by = "DMS")
  # names column with BP id
  for (i in 2:ncol(A2)) {names(A2)[i] = unique(A2[!is.na(A2[i]), i])}
  # replace by 0 or 1 the DMS absence/presence
  A = data.frame(apply(A2[2:9], 2, function(x) ifelse(is.na(x), 0, 1)))
  # add DMS
  A$DMS = A2$DMS
  return(A)
}

## Vector of all 4 comparisons
vecCompa <- c("CC_TC", "CT_TT", "CC_CT", "TC_TT")
## Make upset plots
for (i in 1:4){
  df = getUpsetDF(i)
  print(ComplexUpset::upset(
    df,
    names(df)[1:8],
    width_ratio=0.1,
    sort_intersections_by=c('degree', 'cardinality'),
    queries=query_by_degree(
      df,  names(df)[1:8],
      params_by_degree=list(
        '1'=list(color='red', fill='red'),
        '2'=list(color='purple', fill='purple'),
        '3'=list(color='blue', fill='blue'),
        '4'=list(color='grey', fill='grey'),
        '5'=list(color='red', fill='red'),
        '6'=list(color='purple', fill='purple'),
        '7'=list(color='blue', fill='blue'),
        '8'=list(color='green', fill='green')
      ),
      only_components=c("intersections_matrix", "Intersection size")
    )))
}
## [1] 1099

## [1] 1257

## [1] 329

## [1] 341

1.2.2 Get annotation of these DMS present in at least 4 BP

plotManhattanGenes <- function(i){
  DMSvec = unique(unlist(get2keep(vecCompa[i])))
  # Annotate the DMS present in at least 4 BP: 
  # Change the vector into a methobject:
  df <- data.frame(chr=sapply(strsplit(DMSvec, " "), `[`, 1), 
                   start=sapply(strsplit(DMSvec, " "), `[`, 2),
                   end=sapply(strsplit(DMSvec, " "), `[`, 2)) 
  # get annotation
  anot4BP <- getAnnotationFun(makeGRangesFromDataFrame(df))
  length(anot4BP) # number of genes
  # Reorder chromosome
  anot4BP <- anot4BP %>% 
    mutate(chr = gsub("Gy_chr", "", seqid), chrom_nr = seqid %>% deroman(), chrom_order=factor(chrom_nr) %>% as.numeric()) %>% arrange(chrom_order)  %>%
    mutate(geneLengthkb = (end - start)/1000, nCpGperGenekb = nCpG/geneLengthkb)
  
  # Plot
  plot = ggplot(anot4BP, aes(x=start, y = nCpGperGenekb)) + geom_point(alpha=.4) +
    facet_grid(.~fct_inorder(chr)) +
    geom_hline(yintercept = 1)+
    theme(axis.text.x=element_blank()) +
    xlab(paste0("Genes with DMS present in at least 4 brother pairs\nComparison: ", vecCompa[i])) +
    ylab("Number of differentially methylated CpG per gene kb")
  
  ## Genes with at least 1 CpG differentially methylated per kb:
  topGenes = anot4BP[anot4BP$nCpGperGenekb >= 1,]
  return(list(plot = plot, anot4BP = anot4BP, topGenes = topGenes))
}

plotManhattanGenes(1)$plot
## [1] 1099

plotManhattanGenes(2)$plot
## [1] 1257

plotManhattanGenes(3)$plot
## [1] 329

plotManhattanGenes(4)$plot
## [1] 341

1.3 Which genes have at least 1 CpG differentially methylated per kb:

[1] 1099

comparison chr gene
CC_TC I Similar to mfsd1: Major facilitator superfamily domain-containing protein 1 (Danio rerio OX=7955)
CC_TC I Similar to Lim2: Lens fiber membrane intrinsic protein (Rattus norvegicus OX=10116)
CC_TC III Similar to SELENOF: Selenoprotein F (Homo sapiens OX=9606)
CC_TC IV Similar to MAP1LC3C: Microtubule-associated proteins 1A/1B light chain 3C (Homo sapiens OX=9606)
CC_TC IV Protein of unknown function
CC_TC V Similar to PRSS12: Neurotrypsin (Trachypithecus phayrei OX=61618)
CC_TC V Similar to hba2: Hemoglobin subunit alpha-2 (Anarhichas minor OX=65739)
CC_TC VI Similar to PROKR1: Prokineticin receptor 1 (Bos taurus OX=9913)
CC_TC VI Similar to CIAO2B: Cytosolic iron-sulfur assembly component 2B (Homo sapiens OX=9606)
CC_TC VII Protein of unknown function
CC_TC VII Similar to Kcnj6: G protein-activated inward rectifier potassium channel 2 (Mus musculus OX=10090)
CC_TC VIII Similar to GTF2IRD2: General transcription factor II-I repeat domain-containing protein 2A (Homo sapiens OX=9606)
CC_TC IX Similar to DGKE: Diacylglycerol kinase epsilon (Homo sapiens OX=9606)
CC_TC IX Protein of unknown function
CC_TC X Similar to Type-4 ice-structuring protein LS-12 (Myoxocephalus octodecemspinosus OX=68557)
CC_TC X Similar to kcnk9: Potassium channel subfamily K member 9 (Xenopus laevis OX=8355)
CC_TC XI Similar to CDC42EP1: Cdc42 effector protein 1 (Homo sapiens OX=9606)
CC_TC XII Similar to Tafa1: Chemokine-like protein TAFA-1 (Mus musculus OX=10090)
CC_TC XII Similar to Npbwr1: Neuropeptides B/W receptor type 1 (Mus musculus OX=10090)
CC_TC XIV Similar to rasgef1bb: Ras-GEF domain-containing family member 1B-B (Danio rerio OX=7955)
CC_TC XVI Similar to Me3: NADP-dependent malic enzyme, mitochondrial (Mus musculus OX=10090)
CC_TC XVI Protein of unknown function
CC_TC XVIII Similar to bmp2: Bone morphogenetic protein 2 (Tetraodon nigroviridis OX=99883)
CC_TC XVIII Similar to YY1: Transcriptional repressor protein YY1 (Homo sapiens OX=9606)
CC_TC XVIII Similar to FKBP3: Peptidyl-prolyl cis-trans isomerase FKBP3 (Bos taurus OX=9913)
CC_TC XVIII Similar to rsph9: Radial spoke head protein 9 homolog (Danio rerio OX=7955)
CC_TC XVIII Similar to TENT5A: Terminal nucleotidyltransferase 5A (Homo sapiens OX=9606)
CC_TC XX Similar to MAG: Myelin-associated glycoprotein (Homo sapiens OX=9606)
CC_TC XX Similar to vim: Vimentin (Oncorhynchus mykiss OX=8022)
CC_TC XXI Protein of unknown function

[1] 1257

comparison chr gene
CT_TT I Similar to OR11A1: Olfactory receptor 11A1 (Homo sapiens OX=9606)
CT_TT I Protein of unknown function
CT_TT II Similar to Hapln3: Hyaluronan and proteoglycan link protein 3 (Mus musculus OX=10090)
CT_TT III Similar to MNCb-2990: Uncharacterized protein C14orf119 homolog (Mus musculus OX=10090)
CT_TT III Protein of unknown function
CT_TT IV Similar to Ltc4s: Leukotriene C4 synthase (Rattus norvegicus OX=10116)
CT_TT IV Similar to ZNF518B: Zinc finger protein 518B (Homo sapiens OX=9606)
CT_TT IV Similar to MAP1LC3C: Microtubule-associated proteins 1A/1B light chain 3C (Homo sapiens OX=9606)
CT_TT IV Similar to CHRNA9: Neuronal acetylcholine receptor subunit alpha-9 (Gallus gallus OX=9031)
CT_TT IV Similar to cdpf1: Cysteine-rich DPF motif domain-containing protein 1 (Xenopus laevis OX=8355)
CT_TT V Similar to hba2: Hemoglobin subunit alpha-2 (Anarhichas minor OX=65739)
CT_TT VI Similar to NANP: N-acylneuraminate-9-phosphatase (Homo sapiens OX=9606)
CT_TT VI Similar to CIAO2B: Cytosolic iron-sulfur assembly component 2B (Homo sapiens OX=9606)
CT_TT VIII Protein of unknown function
CT_TT IX Similar to Fhdc1: FH2 domain-containing protein 1 (Mus musculus OX=10090)
CT_TT IX Similar to LGALSL: Galectin-related protein (Gallus gallus OX=9031)
CT_TT IX Similar to eif4a3: Eukaryotic initiation factor 4A-III (Danio rerio OX=7955)
CT_TT X Similar to Type-4 ice-structuring protein LS-12 (Myoxocephalus octodecemspinosus OX=68557)
CT_TT XI Similar to hoxb9a: Homeobox protein Hox-B9a (Takifugu rubripes OX=31033)
CT_TT XI Similar to fzd2: Frizzled-2 (Xenopus laevis OX=8355)
CT_TT XI Similar to ELFN2: Protein phosphatase 1 regulatory subunit 29 (Homo sapiens OX=9606)
CT_TT XI Protein of unknown function
CT_TT XII Similar to MTG2: Mitochondrial ribosome-associated GTPase 2 (Pongo abelii OX=9601)
CT_TT XII Similar to RIBC1: RIB43A-like with coiled-coils protein 1 (Bos taurus OX=9913)
CT_TT XII Similar to rpl10a: 60S ribosomal protein L10a (Ictalurus punctatus OX=7998)
CT_TT XII Similar to MAPK13: Mitogen-activated protein kinase 13 (Bos taurus OX=9913)
CT_TT XIV Similar to S100Z: Protein S100-Z (Homo sapiens OX=9606)
CT_TT XIV Protein of unknown function
CT_TT XVII Similar to FAM107B: Protein FAM107B (Homo sapiens OX=9606)
CT_TT XVIII Similar to Opn5: Opsin-5 (Mus musculus OX=10090)
CT_TT XVIII Similar to FKBP3: Peptidyl-prolyl cis-trans isomerase FKBP3 (Bos taurus OX=9913)
CT_TT XXI Similar to Phex: Phosphate-regulating neutral endopeptidase PHEX (Mus musculus OX=10090)
CT_TT XXI Similar to mc4r: Melanocortin receptor 4 (Danio rerio OX=7955)
CT_TT XXI Similar to acbd5a: Acyl-CoA-binding domain-containing protein 5A (Danio rerio OX=7955)

[1] 329

comparison chr gene
CC_CT I Similar to OR11A1: Olfactory receptor 11A1 (Homo sapiens OX=9606)
CC_CT IV Similar to MAP1LC3C: Microtubule-associated proteins 1A/1B light chain 3C (Homo sapiens OX=9606)
CC_CT IV Protein of unknown function
CC_CT IV Similar to cdpf1: Cysteine-rich DPF motif domain-containing protein 1 (Xenopus laevis OX=8355)
CC_CT V Similar to TRIM16: E3 ubiquitin-protein ligase TRIM16 (Pongo abelii OX=9601)
CC_CT V Similar to hba2: Hemoglobin subunit alpha-2 (Anarhichas minor OX=65739)
CC_CT VI Similar to CIAO2B: Cytosolic iron-sulfur assembly component 2B (Homo sapiens OX=9606)
CC_CT XI Similar to EPN3: Epsin-3 (Homo sapiens OX=9606)
CC_CT XIII Similar to plpp5: Phospholipid phosphatase 5 (Xenopus laevis OX=8355)
CC_CT XVIII Similar to CHGA: Chromogranin-A (Equus caballus OX=9796)
CC_CT XXI Similar to NSUN6: tRNA (cytosine(72)-C(5))-methyltransferase NSUN6 (Homo sapiens OX=9606)

[1] 341

comparison chr gene
TC_TT I Similar to Lim2: Lens fiber membrane intrinsic protein (Rattus norvegicus OX=10116)
TC_TT IV Similar to ZNF518B: Zinc finger protein 518B (Homo sapiens OX=9606)
TC_TT IV Similar to MAP1LC3C: Microtubule-associated proteins 1A/1B light chain 3C (Homo sapiens OX=9606)
TC_TT IV Protein of unknown function
TC_TT V Similar to ZNF691: Zinc finger protein 691 (Bos taurus OX=9913)
TC_TT VII Similar to Rnf167: E3 ubiquitin-protein ligase RNF167 (Mus musculus OX=10090)
TC_TT XIV Similar to Rpl28: 60S ribosomal protein L28 (Mus musculus OX=10090)
TC_TT XVI Protein of unknown function
TC_TT XVIII Similar to Mgat2: Alpha-1,6-mannosyl-glycoprotein 2-beta-N-acetylglucosaminyltransferase (Rattus norvegicus OX=10116)
TC_TT XX Similar to Cdcp1: CUB domain-containing protein 1 (Mus musculus OX=10090)

1.4 Gene Ontology analysis:

  • Gene universe: all genes which are present in the dataset
  • Gene sub-universe: all genes in DMS in the GOterm universe we only want genes with (1) DMS between G2 exposed to parasites, by brother pair, present in AT LEAST 4 BP/8 for which (2) CpGs were covered, and (3) which have a GOterm attributed
# create gene universe
gene_universe <- data.frame(
  subsetByOverlaps(GRanges(annotGff3), GRanges(uniteCov14_G2_woSexAndUnknowChrOVERLAP))) %>% # subselect covered CpGs
  filter(lengths(Ontology_term)!=0) %>% # rm non existing GO terms
  filter(type %in% "gene")  %>% # keep all the 7416 genes with GO terms
  dplyr::select(c("Name", "Ontology_term")) %>%
  mutate(go_linkage_type = "IEA") %>% #NB: IEA but not necessarily true, it's from Interproscan after Maker. Sticklebacks (biomart) have 82701 IEA and 63 ISS.
  relocate("Ontology_term","go_linkage_type","Name") %>%
  unnest(Ontology_term) %>% # one GO per line (was a list before in this column)
  data.frame()

# Create gene set collection
goFrame <- GOFrame(gene_universe, organism="Gasterosteus aculeatus")
goAllFrame <- GOAllFrame(goFrame)
gsc_universe <- GeneSetCollection(goAllFrame, setType = GOCollection())

IMPORTANT NOTE from Mel: why conditional hypergeometric test? The GO ontology is set up as a directed acyclic graph, where a parent term is comprised of all its child terms. If you do a standard hypergeometric, you might e.g., find ‘positive regulation of kinase activity’ to be significant. If you then test ‘positive regulation of catalytic activity’, which is a parent term, then it might be significant as well, but only because of the terms coming from positive regulation of kinase activity.

The conditional hypergeometric takes this into account, and only uses those terms that were not already significant when testing a higher order (parent) term.

makedfGO <- function(i){
  anot4BP = plotManhattanGenes(i)$anot4BP
  
  ## Create subuniverse:
  sub_universe <- gene_universe %>%
    subset(gene_universe$Name %in% unlist(anot4BP$Parent))
  
  ## Run conditional hypergeometric test:
  runTestHypGeom <- function(sub_universe, onto){
    ## Constructing a GOHyperGParams objects or KEGGHyperGParams objects from a GeneSetCollection
    
    # Use GOEnrich as a wrapper around GOStat for extra FDR comparison
    ## Does not solve all issues, but better than nothing.
    ## See: https://support.bioconductor.org/p/5571/
    
    ## To do (maybe?) add gene count in internal GOEnrich function
    GO_fdr <- joinGOEnrichResults(goEnrichTest(gsc=gsc_universe,
                                               gene.ids = as.vector(unique(sub_universe[["Name"]])),# genes in selected gene set
                                               univ.gene.ids = unique(gene_universe$Name),
                                               ontologies = onto, # A string for GO to use ("BP", "CC", or "MF")
                                               pvalue.cutoff = 0.05,
                                               cond = TRUE, # see note above
                                               test.dir = "over"),# over represented GO terms
                                  p.adjust.method = "fdr")
    
    
    ## Run hypergeometric test:
    return(GO_fdr)
  }
  
  ## TO DO: add extra pFDR correction cf Mel
  GO_MF <- runTestHypGeom(sub_universe = sub_universe, onto = "MF")
  GO_CC <- runTestHypGeom(sub_universe = sub_universe, onto = "CC")
  GO_BP <- runTestHypGeom(sub_universe = sub_universe, onto = "BP")
  
  # Merge the df MP and BP
  dfGO = rbind(GO_MF, GO_CC, GO_BP)
  
  dfGO = dfGO %>% mutate(Term = factor(x = GO.term, levels = GO.term),
                         Comp = factor(x = vecCompa[i], levels = vecCompa[i]))
  return(dfGO)
}

## For each comparison:
A=makedfGO(1); B=makedfGO(2); C=makedfGO(3); D=makedfGO(4)
## [1] 1099
## [1] 1257
## [1] 329
## [1] 341
dfGO = rbind(A, B, C, D)

# add type of comparison:
dfGO$group = "Different parental treatment"
dfGO$group[dfGO$Comp %in% c("CC_CT", "TC_TT")] = "Different offpring treatment"


# # to keep the comparison in order:
# dfGO$Comp = factor(dfGO$Comp, levels = unique(dfGO$Comp))
## To save:
# htmlReport(GO_MF_0633, file="../Routput/GO/GO_MF_0633.html")
# write.table(as.data.frame(summary(GO_MF_0633)),"../Routput/GO/GO_MF_0633.txt", sep="\t", row.names = F)
dfGO %>% ggplot(aes(x=Comp, y = factor(GO.name))) +
  geom_point(aes(color = p.value.adjusted))+#, size = Count)) +
  scale_color_gradient(name="P value", low = "red", high = "blue") +
  # scale_size_continuous(name = "Gene number", range = c(1, 10), breaks = c(1,5,15,25))+
  theme_bw() + ylab("") + xlab("Treatments comparison") +
  facet_grid(fct_inorder(GO.category)~group,scales="free",space = "free")

2 Top down analysis: Differential Methylation in parents -> how do they look in offspring?

##############################################################
#### I. Focus on CpG positions at parental (Ctrl-Inf) DMS ####
##############################################################

####################################################################################
#### Question: how are the beta values in the different groups at the parental DMS?##
####################################################################################
##############
## Prepare dataset
##############
PM_G1 <- getPMdataset(uniteCov = uniteCov6_G1_woSexAndUnknowChrOVERLAP, MD = fullMetadata_PAR, gener="parents")
PM_G2 <- getPMdataset(uniteCov = uniteCov14_G2_woSexAndUnknowChrOVERLAP, MD = fullMetadata_OFFS, gener="offspring")

head(PM_G1)
head(PM_G2)

table(fullMetadata_OFFS$trtG1G2, fullMetadata_OFFS$clutch.ID)

## What is the relative contribution of methylation to brother pair & paternal treatment?
## Test of VCA: variance component analysis https://cran.r-project.org/web/packages/VCA/vignettes/VCA_package_vignette.html

## Hypo
PM_G2_mean_hypo <- PM_G2[PM_G2$hypohyper %in% "hypo", ] %>% 
  group_by(brotherPairID, G1_trt, G2_trt, ID) %>% 
  dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()

varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hypo, 
        MeanLine=list(var=c("G1_trt", "G2_trt"), 
                      col=c("white", "blue"), lwd=c(2,2)), 
        BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
        YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypomethylated upon infection"))

myfitVCA_hypo <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hypo) 

### Real values
trtEffect <- sum(myfitVCA_hypo$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA_hypo$aov.tab[5:8, 5])
error <- sum(myfitVCA_hypo$aov.tab[9, 5])
realValHypoVCA <- data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error)

### Randomisation
myrandomVCA <- function(df=PM_G2_mean_hypo){
  randomDF = df
  randomDF$G1_trt = sample(PM_G2_mean_hypo$G1_trt, replace = F)
  randomDF$G2_trt = sample(PM_G2_mean_hypo$G2_trt, replace = F)
  randomDF$brotherPairID = sample(PM_G2_mean_hypo$brotherPairID, replace = F)
  myfitVCA <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = randomDF) 
  trtEffect <- sum(myfitVCA$aov.tab[2:4, 5])
  genEffect <- sum(myfitVCA$aov.tab[5:8, 5])
  error <- sum(myfitVCA$aov.tab[9, 5])
  return(data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error))
}

randomHypoVCA = do.call(rbind, lapply(1:1000, function(x) {
  df=myrandomVCA(PM_G2_mean_hypo)
  df$rep=x
  return(df)}))

randomHypoVCA = melt(randomHypoVCA, id.vars = "rep")
# saveRDS(randomHypoVCA, file = "Rdata/randomHypoVCA.RDS")
randomHypoVCA <- readRDS(file = "Rdata/randomHypoVCA.RDS")
df2=reshape2::melt(realValHypoVCA)

sumDF <- randomHypoVCA %>% 
  group_by(variable) %>%
  dplyr::summarize(value = mean(value)) %>% data.frame()

ggplot(randomHypoVCA, aes(x=variable, y=value))+
  geom_boxplot()+
  geom_jitter(width=.1, alpha=.2)+
  geom_point(data = df2, col = "red", size = 6)+
  geom_text(data=sumDF, aes(label=round(value)), col="white")+
  geom_text(data = df2, aes(label=round(value)), col="white")+
  theme_cleveland()+
  ggtitle("VCA with bootstrap N=1000 at hypo-parDMS", subtitle = "red: observed values")

# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hypo, VarVC=TRUE)

## Hyper
PM_G2_mean_hyper <- PM_G2[PM_G2$hypohyper %in% "hyper", ] %>% 
  group_by(brotherPairID, G1_trt, G2_trt, ID) %>% 
  dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()

varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper, 
        MeanLine=list(var=c("G1_trt", "G2_trt"), 
                      col=c("white", "blue"), lwd=c(2,2)), 
        BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
        YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypermethylated upon infection"))

myfitVCA_hyper <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper) 

### Real values
trtEffect <- sum(myfitVCA_hyper$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA_hyper$aov.tab[5:8, 5])
error <- sum(myfitVCA_hyper$aov.tab[9, 5])
realValHyperVCA <- data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error)

### Randomisation
randomHyperVCA = do.call(rbind, lapply(1:1000, function(x) {
  df=myrandomVCA(PM_G2_mean_hyper)
  df$rep=x
  return(df)}))

randomHyperVCA = melt(randomHyperVCA, id.vars = "rep")
# saveRDS(randomHyperVCA, file = "Rdata/randomHyperVCA.RDS")
randomHyperVCA <- readRDS(file = "Rdata/randomHyperVCA.RDS")
df2=reshape2::melt(realValHyperVCA)

sumDF <- randomHyperVCA %>% 
  group_by(variable) %>%
  dplyr::summarize(value = mean(value)) %>% data.frame()

ggplot(randomHyperVCA, aes(x=variable, y=value))+
  geom_boxplot()+
  geom_jitter(width=.1, alpha=.2)+
  geom_point(data = df2, col = "red", size = 6)+
  geom_text(data=sumDF, aes(label=round(value)), col="white")+
  geom_text(data = df2, aes(label=round(value)), col="white")+
  theme_cleveland()+
  ggtitle("VCA with bootstrap N=1000 at hyper-parDMS", subtitle = "red: observed values")

# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hypo, VarVC=TRUE)

################
## Hyper
PM_G2_mean_hyper <- PM_G2[PM_G2$hypohyper %in% "hyper", ] %>% 
  group_by(brotherPairID, G1_trt, G2_trt, ID) %>% 
  dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()

varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper, 
        MeanLine=list(var=c("G1_trt", "G2_trt"), 
                      col=c("white", "blue"), lwd=c(2,2)), 
        BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
        YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypermethylated upon infection"))

myfitVCA_hyper <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper) 
print(myfitVCA_hyper, digits=4)
# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hyper, VarVC=TRUE)

##############
## In parents
##############
parmod <- lmer(data = PM_G1, BetaValue ~ meth.diff.parentals : Treatment + (1|CpGSite) + (1|brotherPairID))

## check normality of residuals assumption
qqnorm(resid(parmod))
qqline(resid(parmod))

pred <- ggpredict(parmod, terms = c("meth.diff.parentals", "Treatment"))
plot(pred, add.data = T)+
  scale_color_manual(values = c("black", "red"))+
  scale_y_continuous(name = "Beta values")+
  scale_x_continuous(name = "Methylation difference between infected and control parents in percentage")+
  ggtitle("Predicted methylation ratio (Beta) values in parents\n as a function of differential methylation between exposed and control groups")+
  theme_bw()

##############
## Linear model: does the beta value of offspring at DMS depends on treatment Parent x Offspring?
##############
modFull <- lmer(BetaValue ~ (G1_trt * G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID),data = PM_G2, REML = F) # REML =F for model comparison
mod_noG1trt <- lmer(BetaValue ~ G2_trt:hypohyper + (1|CpGSite)+ (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noG2trt <-lmer(BetaValue ~ G1_trt:hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noInteractions <- lmer(BetaValue ~ (G1_trt + G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noHypoHyper <- lmer(BetaValue ~ (G1_trt * G2_trt) + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)

## check normality of residuals assumption
qqnorm(resid(modFull))
qqline(resid(modFull))

## Likelihood ratio tests for all variables:
lrtest(modFull, mod_noG1trt) # G1 trt is VERY VERY significant (LRT: χ² (4) = 1163.6, p < 0.001)
lrtest(modFull, mod_noG2trt) # G2 trt is VERY VERY significant (LRT: χ² (4) = 30.02, p < 0.001) NB that changed when brotherpair is used instead of family!
lrtest(modFull, mod_noInteractions) # interactions are significant (LRT: χ² (2) = 9.21, p < 0.01)
lrtest(modFull, mod_noHypoHyper) # hypo/hyper VERY VERY significant (LRT: χ² (4) = 1140, p < 0.001)

## Post-hoc tests between treatments
modFull <- lmer(BetaValue ~ (G1_trt * G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID),data = PM_G2)
modFull_emmeans <- emmeans(modFull, list(pairwise ~ (G1_trt:G2_trt):hypohyper), adjust = "tukey")
modFull_emmeans

P1 <- plot(modFull_emmeans, by = "hypohyper", comparisons = TRUE) +
  # coord_flip()+
  theme_bw() +
  ggtitle("Estimated marginal means of methylation ratio (beta)\n of offspring at parental DMS")+
  theme(legend.position = "none", axis.title.x = element_blank()) +
  scale_x_continuous("Beta value (methylation ratio)", limits = c(47,69.5))

## NB: Comparison arrows: https://cran.r-project.org/web/packages/emmeans/vignettes/xplanations.html
## two estimated marginal means (EMMs) differ significantly if, and only if, their respective comparison arrows do not overlap
## These comparison arrows are decidedly not the same as confidence intervals.
## Confidence intervals for EMMs are based on the statistical properties of the individual EMMs, whereas comparison arrows
## are based on the statistical properties of differences of EMMs.

## Add the PARENTAL DMS value
## Same test on ALL, G1 and G2 fish
modFullG1 <- lmer(BetaValue ~ G1_trt:hypohyper + (1|CpGSite) + (1|brotherPairID), data = PM_G1)

modFullG1_emmeans <- emmeans(modFullG1, list(pairwise ~ G1_trt:hypohyper), adjust = "tukey")
modFullG1_emmeans

P2 <- plot(modFullG1_emmeans, by = "hypohyper", comparisons = TRUE) +
  theme_bw() +
  ggtitle("Estimated marginal means of methylation ratio (beta)\n of parents at DMS")+
  theme(legend.position = "none", axis.title.x = element_blank()) +
  scale_x_continuous("Beta value (methylation ratio)", limits = c(47,69.5))

ggarrange(P2, P1, labels = c("A", "B"), ncol = 1, nrow = 2)

#####################################################################################
## Are the nbr of residuals methylation AT PARENTAL DMS different in the 4 G2 trt? ##
## (for hypo vs hypermeth)? ##
##############################

length(unique(PM_G1$CpGSite))# 3648 positions
PM_G1 %>% dplyr::count(ID)## NB: not all covered in all samples

length(unique(PM_G2$CpGSite[PM_G2$hypohyper %in% "hypo"]))# 1176 positions hypomethylated upon parental inf
length(unique(PM_G2$CpGSite[PM_G2$hypohyper %in% "hyper"]))# 2472 positions hypermethylated upon parental inf

myfun <- function(X){
  ## Calculate nbr of CpG hypo/hypermethylated per individual, and nbr of covered CpG:
  X <- X %>% group_by(ID, Treatment, brotherPairID, clutch.ID, Sex) %>%
    dplyr::summarise(ncov = n(),
                     hypoMeth = sum(BetaValue < 0.3),
                     hyperMeth = sum(BetaValue > 0.7)) %>% data.frame()
  # Calculate residuals of nbr of methCpG by nbr of covered CpG
  X$res_Nbr_methCpG_Nbr_coveredCpG_HYPO = residuals(lm(X$hypoMeth ~ X$ncov))
  X$res_Nbr_methCpG_Nbr_coveredCpG_HYPER = residuals(lm(X$hyperMeth ~ X$ncov))
  
  ## Statistical model: is it different in each offspring trt group?
  mod1 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
               data = X, REML = F)
  mod0 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ 1 + (1|brotherPairID/clutch.ID) + (1|Sex),
               data = X, REML = F)
  print(lrtest(mod1, mod0))
  
  ## Post-hoc test:
  modhypo <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
                  data = X)
  ## pairwise posthoc test
  print(emmeans(modhypo, list(pairwise ~ Treatment), adjust = "tukey"))
  
  mod3 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
               data = X, REML = F)
  mod4 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ 1 + (1|brotherPairID/clutch.ID) + (1|Sex),
               data = X, REML = F)
  print(lrtest(mod3, mod4))
  
  ## Post-hoc test:
  modhyper <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
                   data = X)
  ## pairwise posthoc test
  print(emmeans(modhyper, list(pairwise ~ Treatment), adjust = "tukey"))
  
  ## Plot
  phypo <- plot(ggpredict(modhypo, terms = c("Treatment")), add.data = TRUE)+
    scale_y_continuous("Residuals of number of hypomethylated methylated \ncytosines on number of cytosines covered") +
    ggtitle("Predicted residuals nbr of hypomethylated CpG")+
    theme_bw()
  
  phyper <- plot(ggpredict(modhyper, terms = c("Treatment")), add.data = TRUE)+
    scale_y_continuous("Residuals of number of hypermethylated methylated \n cytosines on number of cytosines covered") +
    ggtitle("Predicted residuals nbr of hypermethylated CpG")+
    theme_bw()
  return(list(phypo, phyper))
}

listplots <- myfun(X = PM_G2[PM_G2$hypohyper %in% "hypo",])
## NOT significant
annotate_figure(ggarrange(listplots[[1]], listplots[[2]],ncol = 2, nrow = 1),
                top = text_grob("Parental DMS hypomethylated upon infection, in offspring"))

listplots <- myfun(X = PM_G2[PM_G2$hypohyper %in% "hyper",])
## Treatment SIGNIFICANT in both excess hypo/hyper methylation **

# $`pairwise differences of Treatment`
## HYPO
# 1                       estimate   SE   df t.ratio p.value
# NE_control - E_control     23.71 7.28 10.3   3.257  0.0353
# NE_control - E_exposed     26.88 7.26 10.3   3.701  0.0172

## HYPER
# 1                       estimate   SE   df t.ratio p.value
# NE_control - E_control    -24.06 7.36 10.3  -3.269  0.0348
# NE_control - E_exposed    -27.07 7.34 10.3  -3.687  0.0177

annotate_figure(ggarrange(listplots[[1]], listplots[[2]],ncol = 2, nrow = 1),
                top = text_grob("Parental DMS hypermethylated upon infection, in offspring"))

## The beta values in parentalDMS in offspring follow the parental pattern hypo/hyper methylated upon infection

######################################################################################
## II. CORE DMS: Which of the parental DMS are also found in offspring comparisons? ##
######################################################################################
intersect(paste(DMSlist$DMS_15pc_G1_C_T$chr, DMSlist$DMS_15pc_G1_C_T$start),
          intersect(paste(DMSlist$DMS_15pc_CC_CT$chr, DMSlist$DMS_15pc_CC_CT$start),
                    paste(DMSlist$DMS_15pc_TC_TT$chr, DMSlist$DMS_15pc_TC_TT$start)))
## ONLY 4!!! "Gy_chrII 22196179"  "Gy_chrXII 10863858" "Gy_chrXVII 2658079" "Gy_chrXX 5344222" 

###############################################################
## CORE DMS: Which of the DMS are common to CC-CI and IC-II? ##
###############################################################
makeManP <- function(comp1, comp2){
  A <- methylKit::select(DMS1, which(paste(DMS1$chr, DMS1$start) %in% coreDMS))
  B <- as.data.frame(annotateWithGeneParts(as(A,"GRanges"),annotBed12)@members)
  A2 <- methylKit::select(DMS2, 
                          which(paste(DMS2$chr, DMS2$start) %in% coreDMS))
  B2 <- as.data.frame(annotateWithGeneParts(as(A2,"GRanges"),annotBed12)@members)
  
  ggarrange(
    makeManhattanPlots(DMSfile = DMS1, 
                       annotFile = as.data.frame(annotateWithGeneParts(as(DMS1,"GRanges"),annotBed12)@members),
                       GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"), 
                       mytitle = paste0("Manhattan plot of ", comp1, " DMS")),
    makeManhattanPlots(DMSfile = DMS2, 
                       annotFile = as.data.frame(annotateWithGeneParts(as(DMS2,"GRanges"),annotBed12)@members),
                       GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"), 
                       mytitle = paste0("Manhattan plot of ", comp2, " DMS")),
    makeManhattanPlots(DMSfile = A, annotFile = B, GYgynogff = GYgynogff, 
                       mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot core DMS in ", comp1)),
    makeManhattanPlots(DMSfile = A2, annotFile = B2, GYgynogff = GYgynogff, 
                       mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot core DMS in ", comp2)),
    labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4, common.legend = T)
}

DMS1 = DMSlist$DMS_15pc_CC_CT # from: getDiffMeth(myuniteCov = uniteCov14_G2_woSexAndUnknowChr_G1CONTROL, myMetadata = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% c(5,6),])
DMS2 = DMSlist$DMS_15pc_TC_TT # from: getDiffMeth(myuniteCov = uniteCov14_G2_woSexAndUnknowChr_G1INFECTED, myMetadata = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% c(2,3),])
coreDMS <- intersect(paste(DMS1$chr, DMS1$start), paste(DMS2$chr, DMS2$start))

## Make Manhattan plot:
makeManP(comp1 = "CC-CI", comp2 = "IC-II")

## Annotation of the core DMS:
coreDMSmethylDiff <- methylKit::select(DMS1, which(paste(DMS1$chr, DMS1$start) %in% coreDMS))

## Differentially methylated sites:
subGOterms = getAnnotationFun(METHOBJ = coreDMSmethylDiff)

## Background annotations:
universeGOterms = getAnnotationFun(METHOBJ = uniteCov14_G2_woSexAndUnknowChrOVERLAP)

length(universeGOterms)# 16024


# as.vector(lapply(strsplit(as.character(TSSAssociation_DiffMeth15p$feature.name), "\\."), "[", 1))

# Using the genes with associated pop-DMS, we applied
# a conditional hypergeometric Gene Ontology (GO) term enrichment
# analysis (false discovery rate–corrected P ≤ 0.05) with the Ensembl
# stickleback annotation dataset “gaculeatus_gene_ensembl,” and all
# genes that were associated to any sequenced CpG site were used as
# universe. We identified overrepresented biological processes, molec-
#   ular functions, and cellular components using the packages GOstats
# version 2.5 (59) and GSEABase version 1.46 (60) and corrected for
# multiple testing using the false discovery rate method implemented
# in goEnrichment version 1.0 (61) in R version 3.6 (52). Figures were
# produced using ggplot2 version 3.2 (56)

#######################################################################
## TRANSMITTED DMS: Which of the DMS are found in CCvsIC and CIvsII? ##
#######################################################################
makeManP(DMS1 = DMS15pc_G1_controlG2_half, comp1 = "CC-IC",
         DMS2 = DMS15pc_G1_infectedG2_half, comp2 = "CI-II")

##############
## Plot mean Beta values of offsprings at parental DMS, per trt, along the chromosomes
##############
meanBeta_G2_simple <- PM_G2 %>% group_by(Chr, Pos, Treatment) %>%
  dplyr::summarize(Mean = mean(BetaValue, na.rm=TRUE))
names(meanBeta_G2_simple) <- c("Chr", "Pos", "Treatment_G2", "MeanBetaG2")

genome <- GYgynogff %>%
  mutate(chrom_nr=chrom %>% deroman(), chrom_order=factor(chrom_nr) %>% as.numeric()) %>% 
  arrange(chrom_order) %>%
  mutate(gstart=lag(length,default=0) %>% cumsum(), gend=gstart+length, 
         type=LETTERS[2-(chrom_order%%2)],   gmid=(gstart+gend)/2)

mydata = tibble(trt=meanBeta_G2_simple$Treatment_G2,
                chrom=meanBeta_G2_simple$Chr, pos=meanBeta_G2_simple$Pos, beta=meanBeta_G2_simple$MeanBetaG2)
mydata$pos <- as.numeric(mydata$pos)
mydata$pos <- as.numeric(mydata$pos)

table(mydata$chrom)## check that chrXIX and chrUN are well removed!!

# join DMS and genomic position
data = dplyr::left_join(mydata, genome) %>% dplyr::mutate(gpos=pos+gstart)

# plot:
ggplot()+
  geom_rect(data=genome,aes(xmin=gstart,xmax=gend,ymin=-Inf,ymax=Inf,fill=type), alpha=.2)+
  geom_point(data=data, aes(x=gpos,y=beta, shape=trt, col=trt),fill="white", size = 1)+
  scale_color_manual(values = colOffs)+
  scale_shape_manual(values=c(21,21,21,21))+
  scale_fill_manual(values=c(A=rgb(.9,.9,.9),B=NA),guide="none")+
  scale_x_continuous(breaks=genome$gmid,labels=genome$chrom %>% str_remove(.,"Gy_chr"),
                     position = "top",expand = c(0,0))+
  theme_minimal()+
  theme(panel.grid = element_blank(),
        axis.line=element_blank(),
        axis.title = element_blank(),
        strip.placement = "outside")+
  facet_grid(trt~.)+
  ggtitle("Mean methylation proportions at the 3648 parental DMS for each offspring group")

###########################################################################################
## Not conclusive: test hyp that beta value is different (higher?) in the center of the chromosome, 
## where there are less recombinations. Test for each chromosome
meanBeta_G2_extended <- PM_G2 %>% group_by(Chr, Pos, Treatment, Sex, brotherPairID) %>%
  dplyr::summarize(Mean = mean(BetaValue, na.rm=TRUE))
names(meanBeta_G2_extended) <- c("Chr", "Pos", "Treatment_G2","Sex", "brotherPairID", "MeanBetaG2")

mydata = tibble(trt=meanBeta_G2_extended$Treatment_G2, sex = meanBeta_G2_extended$Sex, brotherPairID = meanBeta_G2_extended$brotherPairID,
                chrom=meanBeta_G2_extended$Chr, pos=meanBeta_G2_extended$Pos, beta=meanBeta_G2_extended$MeanBetaG2)
mydata$pos <- as.numeric(mydata$pos)

# join DMS and genomic position
data = dplyr::left_join(mydata, genome) %>% dplyr::mutate(gpos=pos+gstart)

## Add distance to center
data$dist2mid <- abs(data$gmid - data$gpos)

mod <- lmer(beta ~ dist2mid:chrom + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
mod0 <- lmer(beta ~ dist2mid + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
mod00 <- lmer(beta ~ chrom + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)

lrtest(mod, mod0) # chromosome matters
lrtest(mod0, mod00) # distance to middle matters

## check normality of residuals assumption
qqnorm(resid(mod))
qqline(resid(mod)) # quite skewed

pred <- ggpredict(mod, terms = c("dist2mid","chrom"))
plot(pred) +
  scale_color_manual(values = 1:20)